# Suppress ALL warnings for the whole session.
# NOTE(review): blanket suppression hides real issues (convergence warnings,
# dtype deprecations) — consider filtering specific categories instead.
from warnings import filterwarnings
filterwarnings('ignore')
import os
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; prefer a relative path or a configurable data directory.
os.chdir("D:/deta frame all/training_set")
import pandas as pd
# Load the house-prices training data (1460 rows x 81 columns per df.info()).
df = pd.read_csv('training_set.csv')
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# Column dtypes and non-null counts — identifies columns with missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 1452 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null 
object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
# Per-column missing-value counts, restricted to columns that have any.
null_counts = df.isna().sum()
null_counts[null_counts > 0]
LotFrontage 259 Alley 1369 MasVnrType 8 MasVnrArea 8 BsmtQual 37 BsmtCond 37 BsmtExposure 38 BsmtFinType1 37 BsmtFinType2 38 Electrical 1 FireplaceQu 690 GarageType 81 GarageYrBlt 81 GarageFinish 81 GarageQual 81 GarageCond 81 PoolQC 1453 Fence 1179 MiscFeature 1406 dtype: int64
Model: SalePrice ~ remaining features (predict the target from all other columns)
# Separate the target (SalePrice, kept as a one-column DataFrame) from the
# predictors (everything except the row identifier and the target).
Y = df[['SalePrice']]
X = df.drop(columns=['Id', 'SalePrice'])
X.head()
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal |
| 1 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | ... | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal |
| 2 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | ... | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal |
| 3 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | ... | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml |
| 4 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | ... | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal |
5 rows × 79 columns
# Peek at the target frame.
Y.head()
| SalePrice | |
|---|---|
| 0 | 208500 |
| 1 | 181500 |
| 2 | 223500 |
| 3 | 140000 |
| 4 | 250000 |
# PM8 is a local helper module; catconsep presumably splits the frame's
# columns into categorical (object dtype) and continuous (numeric) name
# lists, judging by the two outputs below — TODO confirm against PM8.
from PM8 import catconsep
cat, con = catconsep(X)
cat
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
# Continuous (numeric) feature names.
con
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
# Bar chart of category frequencies for every categorical feature.
import matplotlib.pyplot as plt
for col in cat:
    counts = df[col].value_counts()
    counts.plot(kind='bar', title=f'Countplot for {col}')
    plt.show()
# Distribution of each continuous feature (histogram with a KDE overlay).
import seaborn as sns
for col in con:
    ax = sns.histplot(data=df, x=col, kde=True)
    ax.set_title(f'Histogram for {col}')
    plt.show()
# Pairwise Pearson correlations of the numeric columns.
# numeric_only=True is required: pandas >= 2.0 raises a TypeError when
# object-dtype columns are present (older versions dropped them silently).
df.corr(numeric_only=True)
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | 1.000000 | 0.011156 | -0.010601 | -0.033226 | -0.028365 | 0.012609 | -0.012713 | -0.021998 | -0.050298 | -0.005024 | ... | -0.029643 | -0.000477 | 0.002889 | -0.046635 | 0.001330 | 0.057044 | -0.006242 | 0.021172 | 0.000712 | -0.021917 |
| MSSubClass | 0.011156 | 1.000000 | -0.386347 | -0.139781 | 0.032628 | -0.059316 | 0.027850 | 0.040581 | 0.022936 | -0.069836 | ... | -0.012579 | -0.006100 | -0.012037 | -0.043825 | -0.026030 | 0.008283 | -0.007683 | -0.013585 | -0.021407 | -0.084284 |
| LotFrontage | -0.010601 | -0.386347 | 1.000000 | 0.426095 | 0.251646 | -0.059213 | 0.123349 | 0.088866 | 0.193458 | 0.233633 | ... | 0.088521 | 0.151972 | 0.010700 | 0.070029 | 0.041383 | 0.206167 | 0.003368 | 0.011200 | 0.007450 | 0.351799 |
| LotArea | -0.033226 | -0.139781 | 0.426095 | 1.000000 | 0.105806 | -0.005636 | 0.014228 | 0.013788 | 0.104160 | 0.214103 | ... | 0.171698 | 0.084774 | -0.018340 | 0.020423 | 0.043160 | 0.077672 | 0.038068 | 0.001205 | -0.014261 | 0.263843 |
| OverallQual | -0.028365 | 0.032628 | 0.251646 | 0.105806 | 1.000000 | -0.091932 | 0.572323 | 0.550684 | 0.411876 | 0.239666 | ... | 0.238923 | 0.308819 | -0.113937 | 0.030371 | 0.064886 | 0.065166 | -0.031406 | 0.070815 | -0.027347 | 0.790982 |
| OverallCond | 0.012609 | -0.059316 | -0.059213 | -0.005636 | -0.091932 | 1.000000 | -0.375983 | 0.073741 | -0.128101 | -0.046231 | ... | -0.003334 | -0.032589 | 0.070356 | 0.025504 | 0.054811 | -0.001985 | 0.068777 | -0.003511 | 0.043950 | -0.077856 |
| YearBuilt | -0.012713 | 0.027850 | 0.123349 | 0.014228 | 0.572323 | -0.375983 | 1.000000 | 0.592855 | 0.315707 | 0.249503 | ... | 0.224880 | 0.188686 | -0.387268 | 0.031355 | -0.050364 | 0.004950 | -0.034383 | 0.012398 | -0.013618 | 0.522897 |
| YearRemodAdd | -0.021998 | 0.040581 | 0.088866 | 0.013788 | 0.550684 | 0.073741 | 0.592855 | 1.000000 | 0.179618 | 0.128451 | ... | 0.205726 | 0.226298 | -0.193919 | 0.045286 | -0.038740 | 0.005829 | -0.010286 | 0.021490 | 0.035743 | 0.507101 |
| MasVnrArea | -0.050298 | 0.022936 | 0.193458 | 0.104160 | 0.411876 | -0.128101 | 0.315707 | 0.179618 | 1.000000 | 0.264736 | ... | 0.159718 | 0.125703 | -0.110204 | 0.018796 | 0.061466 | 0.011723 | -0.029815 | -0.005965 | -0.008201 | 0.477493 |
| BsmtFinSF1 | -0.005024 | -0.069836 | 0.233633 | 0.214103 | 0.239666 | -0.046231 | 0.249503 | 0.128451 | 0.264736 | 1.000000 | ... | 0.204306 | 0.111761 | -0.102303 | 0.026451 | 0.062021 | 0.140491 | 0.003571 | -0.015727 | 0.014359 | 0.386420 |
| BsmtFinSF2 | -0.005968 | -0.065649 | 0.049900 | 0.111170 | -0.059119 | 0.040229 | -0.049107 | -0.067759 | -0.072319 | -0.050117 | ... | 0.067898 | 0.003093 | 0.036543 | -0.029993 | 0.088871 | 0.041709 | 0.004940 | -0.015211 | 0.031706 | -0.011378 |
| BsmtUnfSF | -0.007940 | -0.140759 | 0.132644 | -0.002618 | 0.308159 | -0.136841 | 0.149040 | 0.181133 | 0.114442 | -0.495251 | ... | -0.005316 | 0.129005 | -0.002538 | 0.020764 | -0.012579 | -0.035092 | -0.023837 | 0.034888 | -0.041258 | 0.214479 |
| TotalBsmtSF | -0.015415 | -0.238518 | 0.392075 | 0.260833 | 0.537808 | -0.171098 | 0.391452 | 0.291066 | 0.363936 | 0.522396 | ... | 0.232019 | 0.247264 | -0.095478 | 0.037384 | 0.084489 | 0.126053 | -0.018479 | 0.013196 | -0.014969 | 0.613581 |
| 1stFlrSF | 0.010496 | -0.251758 | 0.457181 | 0.299475 | 0.476224 | -0.144203 | 0.281986 | 0.240379 | 0.344501 | 0.445863 | ... | 0.235459 | 0.211671 | -0.065292 | 0.056104 | 0.088758 | 0.131525 | -0.021096 | 0.031372 | -0.013604 | 0.605852 |
| 2ndFlrSF | 0.005590 | 0.307886 | 0.080177 | 0.050986 | 0.295493 | 0.028942 | 0.010308 | 0.140024 | 0.174561 | -0.137079 | ... | 0.092165 | 0.208026 | 0.061989 | -0.024358 | 0.040606 | 0.081487 | 0.016197 | 0.035164 | -0.028700 | 0.319334 |
| LowQualFinSF | -0.044230 | 0.046474 | 0.038469 | 0.004779 | -0.030429 | 0.025494 | -0.183784 | -0.062419 | -0.069071 | -0.064503 | ... | -0.025444 | 0.018251 | 0.061081 | -0.004296 | 0.026799 | 0.062157 | -0.003793 | -0.022174 | -0.028921 | -0.025606 |
| GrLivArea | 0.008273 | 0.074853 | 0.402797 | 0.263116 | 0.593007 | -0.079686 | 0.199010 | 0.287389 | 0.390857 | 0.208171 | ... | 0.247433 | 0.330224 | 0.009113 | 0.020643 | 0.101510 | 0.170205 | -0.002416 | 0.050240 | -0.036526 | 0.708624 |
| BsmtFullBath | 0.002289 | 0.003491 | 0.100949 | 0.158155 | 0.111098 | -0.054942 | 0.187599 | 0.119470 | 0.085310 | 0.649212 | ... | 0.175315 | 0.067341 | -0.049911 | -0.000106 | 0.023148 | 0.067616 | -0.023047 | -0.025361 | 0.067049 | 0.227122 |
| BsmtHalfBath | -0.020155 | -0.002333 | -0.007234 | 0.048046 | -0.040150 | 0.117821 | -0.038162 | -0.012337 | 0.026673 | 0.067418 | ... | 0.040161 | -0.025324 | -0.008555 | 0.035114 | 0.032121 | 0.020025 | -0.007367 | 0.032873 | -0.046524 | -0.016844 |
| FullBath | 0.005587 | 0.131608 | 0.198769 | 0.126031 | 0.550600 | -0.194149 | 0.468271 | 0.439046 | 0.276833 | 0.058543 | ... | 0.187703 | 0.259977 | -0.115093 | 0.035353 | -0.008106 | 0.049604 | -0.014290 | 0.055872 | -0.019669 | 0.560664 |
| HalfBath | 0.006784 | 0.177354 | 0.053532 | 0.014259 | 0.273458 | -0.060769 | 0.242656 | 0.183331 | 0.201444 | 0.004262 | ... | 0.108080 | 0.199740 | -0.095317 | -0.004972 | 0.072426 | 0.022381 | 0.001290 | -0.009050 | -0.010269 | 0.284108 |
| BedroomAbvGr | 0.037719 | -0.023438 | 0.263170 | 0.119690 | 0.101676 | 0.012980 | -0.070651 | -0.040581 | 0.102821 | -0.107355 | ... | 0.046854 | 0.093810 | 0.041570 | -0.024478 | 0.044300 | 0.070703 | 0.007767 | 0.046544 | -0.036014 | 0.168213 |
| KitchenAbvGr | 0.002951 | 0.281721 | -0.006069 | -0.017784 | -0.183882 | -0.087001 | -0.174800 | -0.149598 | -0.037610 | -0.081007 | ... | -0.090130 | -0.070091 | 0.037312 | -0.024600 | -0.051613 | -0.014525 | 0.062341 | 0.026589 | 0.031687 | -0.135907 |
| TotRmsAbvGrd | 0.027239 | 0.040380 | 0.352096 | 0.190015 | 0.427452 | -0.057583 | 0.095589 | 0.191740 | 0.280682 | 0.044316 | ... | 0.165984 | 0.234192 | 0.004151 | -0.006683 | 0.059383 | 0.083757 | 0.024763 | 0.036907 | -0.034516 | 0.533723 |
| Fireplaces | -0.019772 | -0.045569 | 0.266639 | 0.271364 | 0.396765 | -0.023820 | 0.147716 | 0.112581 | 0.249070 | 0.260011 | ... | 0.200019 | 0.169405 | -0.024822 | 0.011257 | 0.184530 | 0.095074 | 0.001409 | 0.046357 | -0.024096 | 0.466929 |
| GarageYrBlt | 0.000072 | 0.085072 | 0.070250 | -0.024947 | 0.547766 | -0.324297 | 0.825667 | 0.642277 | 0.252691 | 0.153484 | ... | 0.224577 | 0.228425 | -0.297003 | 0.023544 | -0.075418 | -0.014501 | -0.032417 | 0.005337 | -0.001014 | 0.486362 |
| GarageCars | 0.016570 | -0.040110 | 0.285691 | 0.154871 | 0.600671 | -0.185758 | 0.537850 | 0.420622 | 0.364204 | 0.224054 | ... | 0.226342 | 0.213569 | -0.151434 | 0.035765 | 0.050494 | 0.020934 | -0.043080 | 0.040522 | -0.039117 | 0.640409 |
| GarageArea | 0.017634 | -0.098672 | 0.344997 | 0.180403 | 0.562022 | -0.151521 | 0.478954 | 0.371600 | 0.373066 | 0.296970 | ... | 0.224666 | 0.241435 | -0.121777 | 0.035087 | 0.051412 | 0.061047 | -0.027400 | 0.027974 | -0.027378 | 0.623431 |
| WoodDeckSF | -0.029643 | -0.012579 | 0.088521 | 0.171698 | 0.238923 | -0.003334 | 0.224880 | 0.205726 | 0.159718 | 0.204306 | ... | 1.000000 | 0.058661 | -0.125989 | -0.032771 | -0.074181 | 0.073378 | -0.009551 | 0.021011 | 0.022270 | 0.324413 |
| OpenPorchSF | -0.000477 | -0.006100 | 0.151972 | 0.084774 | 0.308819 | -0.032589 | 0.188686 | 0.226298 | 0.125703 | 0.111761 | ... | 0.058661 | 1.000000 | -0.093079 | -0.005842 | 0.074304 | 0.060762 | -0.018584 | 0.071255 | -0.057619 | 0.315856 |
| EnclosedPorch | 0.002889 | -0.012037 | 0.010700 | -0.018340 | -0.113937 | 0.070356 | -0.387268 | -0.193919 | -0.110204 | -0.102303 | ... | -0.125989 | -0.093079 | 1.000000 | -0.037305 | -0.082864 | 0.054203 | 0.018361 | -0.028887 | -0.009916 | -0.128578 |
| 3SsnPorch | -0.046635 | -0.043825 | 0.070029 | 0.020423 | 0.030371 | 0.025504 | 0.031355 | 0.045286 | 0.018796 | 0.026451 | ... | -0.032771 | -0.005842 | -0.037305 | 1.000000 | -0.031436 | -0.007992 | 0.000354 | 0.029474 | 0.018645 | 0.044584 |
| ScreenPorch | 0.001330 | -0.026030 | 0.041383 | 0.043160 | 0.064886 | 0.054811 | -0.050364 | -0.038740 | 0.061466 | 0.062021 | ... | -0.074181 | 0.074304 | -0.082864 | -0.031436 | 1.000000 | 0.051307 | 0.031946 | 0.023217 | 0.010694 | 0.111447 |
| PoolArea | 0.057044 | 0.008283 | 0.206167 | 0.077672 | 0.065166 | -0.001985 | 0.004950 | 0.005829 | 0.011723 | 0.140491 | ... | 0.073378 | 0.060762 | 0.054203 | -0.007992 | 0.051307 | 1.000000 | 0.029669 | -0.033737 | -0.059689 | 0.092404 |
| MiscVal | -0.006242 | -0.007683 | 0.003368 | 0.038068 | -0.031406 | 0.068777 | -0.034383 | -0.010286 | -0.029815 | 0.003571 | ... | -0.009551 | -0.018584 | 0.018361 | 0.000354 | 0.031946 | 0.029669 | 1.000000 | -0.006495 | 0.004906 | -0.021190 |
| MoSold | 0.021172 | -0.013585 | 0.011200 | 0.001205 | 0.070815 | -0.003511 | 0.012398 | 0.021490 | -0.005965 | -0.015727 | ... | 0.021011 | 0.071255 | -0.028887 | 0.029474 | 0.023217 | -0.033737 | -0.006495 | 1.000000 | -0.145721 | 0.046432 |
| YrSold | 0.000712 | -0.021407 | 0.007450 | -0.014261 | -0.027347 | 0.043950 | -0.013618 | 0.035743 | -0.008201 | 0.014359 | ... | 0.022270 | -0.057619 | -0.009916 | 0.018645 | 0.010694 | -0.059689 | 0.004906 | -0.145721 | 1.000000 | -0.028923 |
| SalePrice | -0.021917 | -0.084284 | 0.351799 | 0.263843 | 0.790982 | -0.077856 | 0.522897 | 0.507101 | 0.477493 | 0.386420 | ... | 0.324413 | 0.315856 | -0.128578 | 0.044584 | 0.111447 | 0.092404 | -0.021190 | 0.046432 | -0.028923 | 1.000000 |
38 rows × 38 columns
# Annotated heatmap of the numeric-feature correlation matrix.
plt.figure(figsize=(25, 25))
# numeric_only=True avoids the pandas >= 2.0 TypeError on object columns.
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt='.2f')
plt.show()
# SalePrice against each continuous predictor.
for feature in con:
    if feature == 'SalePrice':
        continue  # guard: skip the target itself if it ever appears in con
    plt.scatter(df[feature], df['SalePrice'])
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(f'ScatterPlot for SalePrice vs {feature}')
    plt.show()
# Box plots of SalePrice across the levels of each CATEGORICAL feature.
# (The original comment said "continuous", but the loop iterates `cat`.)
for i in cat:
    plt.figure(figsize=(16,8))
    sns.boxplot(data=df,x=i,y='SalePrice')
    plt.title(f'Boxplot for SalePrice vs {i}')
    plt.show()
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Numeric pipeline: mean-impute missing values, then standardize.
num_pipe1 = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                            ('scaler', StandardScaler())])
# Categorical pipeline: mode-impute, then integer-encode each category.
# handle_unknown/unknown_value: map categories unseen at fit time to -1
# instead of raising at transform time (e.g. when scoring the test set).
# NOTE(review): OrdinalEncoder imposes an arbitrary order on nominal
# categories; one-hot encoding may suit linear models better.
cat_pipe1 = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                            ('ord_enc', OrdinalEncoder(handle_unknown='use_encoded_value',
                                                       unknown_value=-1))])
# Route continuous columns through the numeric pipeline and categorical
# columns through the categorical one.
pre1 = ColumnTransformer([('num', num_pipe1, con),
                          ('cat', cat_pipe1, cat)])
X_pre = pre1.fit_transform(X)
X_pre
array([[ 0.07337496, -0.22937175, -0.20714171, ..., 2. ,
8. , 4. ],
[-0.87256276, 0.4519361 , -0.09188637, ..., 2. ,
8. , 4. ],
[ 0.07337496, -0.09311018, 0.07347998, ..., 2. ,
8. , 4. ],
...,
[ 0.30985939, -0.18395123, -0.14781027, ..., 2. ,
8. , 4. ],
[-0.87256276, -0.09311018, -0.08016039, ..., 2. ,
8. , 4. ],
[-0.87256276, 0.22483348, -0.05811155, ..., 2. ,
8. , 4. ]])
# Transformed column names, prefixed by transformer ('num__' / 'cat__').
cols = pre1.get_feature_names_out()
cols
array(['num__MSSubClass', 'num__LotFrontage', 'num__LotArea',
'num__OverallQual', 'num__OverallCond', 'num__YearBuilt',
'num__YearRemodAdd', 'num__MasVnrArea', 'num__BsmtFinSF1',
'num__BsmtFinSF2', 'num__BsmtUnfSF', 'num__TotalBsmtSF',
'num__1stFlrSF', 'num__2ndFlrSF', 'num__LowQualFinSF',
'num__GrLivArea', 'num__BsmtFullBath', 'num__BsmtHalfBath',
'num__FullBath', 'num__HalfBath', 'num__BedroomAbvGr',
'num__KitchenAbvGr', 'num__TotRmsAbvGrd', 'num__Fireplaces',
'num__GarageYrBlt', 'num__GarageCars', 'num__GarageArea',
'num__WoodDeckSF', 'num__OpenPorchSF', 'num__EnclosedPorch',
'num__3SsnPorch', 'num__ScreenPorch', 'num__PoolArea',
'num__MiscVal', 'num__MoSold', 'num__YrSold', 'cat__MSZoning',
'cat__Street', 'cat__Alley', 'cat__LotShape', 'cat__LandContour',
'cat__Utilities', 'cat__LotConfig', 'cat__LandSlope',
'cat__Neighborhood', 'cat__Condition1', 'cat__Condition2',
'cat__BldgType', 'cat__HouseStyle', 'cat__RoofStyle',
'cat__RoofMatl', 'cat__Exterior1st', 'cat__Exterior2nd',
'cat__MasVnrType', 'cat__ExterQual', 'cat__ExterCond',
'cat__Foundation', 'cat__BsmtQual', 'cat__BsmtCond',
'cat__BsmtExposure', 'cat__BsmtFinType1', 'cat__BsmtFinType2',
'cat__Heating', 'cat__HeatingQC', 'cat__CentralAir',
'cat__Electrical', 'cat__KitchenQual', 'cat__Functional',
'cat__FireplaceQu', 'cat__GarageType', 'cat__GarageFinish',
'cat__GarageQual', 'cat__GarageCond', 'cat__PavedDrive',
'cat__PoolQC', 'cat__Fence', 'cat__MiscFeature', 'cat__SaleType',
'cat__SaleCondition'], dtype=object)
# Rebuild a labelled DataFrame from the transformed ndarray.
X_pre = pd.DataFrame(X_pre,columns=cols)
X_pre.head()
| num__MSSubClass | num__LotFrontage | num__LotArea | num__OverallQual | num__OverallCond | num__YearBuilt | num__YearRemodAdd | num__MasVnrArea | num__BsmtFinSF1 | num__BsmtFinSF2 | ... | cat__GarageType | cat__GarageFinish | cat__GarageQual | cat__GarageCond | cat__PavedDrive | cat__PoolQC | cat__Fence | cat__MiscFeature | cat__SaleType | cat__SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.073375 | -0.229372 | -0.207142 | 0.651479 | -0.517200 | 1.050994 | 0.878668 | 0.511418 | 0.575425 | -0.288653 | ... | 1.0 | 1.0 | 4.0 | 4.0 | 2.0 | 2.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| 1 | -0.872563 | 0.451936 | -0.091886 | -0.071836 | 2.179628 | 0.156734 | -0.429577 | -0.574410 | 1.171992 | -0.288653 | ... | 1.0 | 1.0 | 4.0 | 4.0 | 2.0 | 2.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| 2 | 0.073375 | -0.093110 | 0.073480 | 0.651479 | -0.517200 | 0.984752 | 0.830215 | 0.323060 | 0.092907 | -0.288653 | ... | 1.0 | 1.0 | 4.0 | 4.0 | 2.0 | 2.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| 3 | 0.309859 | -0.456474 | -0.096897 | 0.651479 | -0.517200 | -1.863632 | -0.720298 | -0.574410 | -0.499274 | -0.288653 | ... | 5.0 | 2.0 | 4.0 | 4.0 | 2.0 | 2.0 | 2.0 | 2.0 | 8.0 | 0.0 |
| 4 | 0.073375 | 0.633618 | 0.375148 | 1.374795 | -0.517200 | 0.951632 | 0.733308 | 1.364570 | 0.463568 | -0.288653 | ... | 1.0 | 1.0 | 4.0 | 4.0 | 2.0 | 2.0 | 2.0 | 2.0 | 8.0 | 4.0 |
5 rows × 79 columns
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
# Base estimator scored at each greedy step.
model = LinearRegression()
# Greedy forward selection; the default n_features_to_select keeps half the
# columns (39 of 79, per the outputs below). n_jobs=-1 parallelises the
# per-candidate CV fits.
sel = SequentialFeatureSelector(model,direction='forward',n_jobs=-1)
sel_features = sel.fit_transform(X_pre,Y)
# Prefixed names of the retained columns.
sel_cols = sel.get_feature_names_out()
sel_cols
array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
'num__OverallCond', 'num__YearBuilt', 'num__MasVnrArea',
'num__BsmtFinSF1', 'num__GrLivArea', 'num__BsmtFullBath',
'num__KitchenAbvGr', 'num__TotRmsAbvGrd', 'num__Fireplaces',
'num__GarageCars', 'num__WoodDeckSF', 'num__OpenPorchSF',
'num__ScreenPorch', 'num__PoolArea', 'num__YrSold', 'cat__Street',
'cat__LandContour', 'cat__Utilities', 'cat__Neighborhood',
'cat__BldgType', 'cat__HouseStyle', 'cat__RoofStyle',
'cat__RoofMatl', 'cat__Exterior1st', 'cat__MasVnrType',
'cat__ExterQual', 'cat__BsmtQual', 'cat__BsmtCond',
'cat__BsmtExposure', 'cat__HeatingQC', 'cat__KitchenQual',
'cat__Functional', 'cat__GarageCond', 'cat__PavedDrive',
'cat__Fence', 'cat__MiscFeature'], dtype=object)
# Strip the ColumnTransformer prefix ('num__' / 'cat__') from each selected
# name to recover the original DataFrame column names.
# maxsplit=1 guards against original column names that contain '__'.
imp_cols = [name.split('__', 1)[1] for name in sel_cols]
imp_cols
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'GrLivArea', 'BsmtFullBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'OpenPorchSF', 'ScreenPorch', 'PoolArea', 'YrSold', 'Street', 'LandContour', 'Utilities', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature']
# Number of selected features (39 per the output below).
len(imp_cols)
39
from sklearn.model_selection import train_test_split

# 80/20 hold-out split (fixed seed for reproducibility).
# NOTE(review): the model is fit on ALL 79 preprocessed features — the
# SFS-selected subset (imp_cols) computed above is never used; confirm that
# is intentional.
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,test_size=0.2,random_state=60)
# LinearRegression is already imported earlier in the file, so the duplicate
# import that was here has been removed.
model = LinearRegression()
model.fit(xtrain,ytrain)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Train R^2.
model.score(xtrain,ytrain)
0.9081153413377874
# Test R^2 — the large train/test gap (0.91 vs 0.54) suggests overfitting.
model.score(xtest,ytest)
0.5409575181518753
# Local helper: presumably prints MSE/RMSE/MAE/R2 for train and test splits
# (matching the output below) — defined in the PM8 module; TODO confirm.
from PM8 import evaluate_model
evaluate_model(xtrain,ytrain,xtest,ytest,model)
Training Scores : MSE : 540106756.2510259 RMSE: 23240.196992517638 MAE : 16514.20199468709 R2 : 0.9081153413377874 ============================ Testing Scores : MSE : 3682018665.499123 RMSE: 60679.639628949044 MAE : 23545.628424657534 R2 : 0.5409575181518753
from sklearn.model_selection import cross_val_score
# 5-fold CV MSE (sklearn reports it negated).
# NOTE(review): fold 3 scores ~ -6.5e26 — an exploding OLS fit on that fold,
# likely from collinear/near-singular encoded features; treat the means of
# all the CV metrics below with caution.
scores = cross_val_score(model,xtrain,ytrain,cv=5,scoring='neg_mean_squared_error')
scores
array([-6.79773130e+08, -1.44153229e+12, -6.48970057e+26, -6.56465315e+08,
-7.94084463e+08])
# Mean CV MSE — dominated by the degenerate fold.
scores.mean()
-1.2979401148287266e+26
# 5-fold CV MAE (negated).
mae_scores = cross_val_score(model,xtrain,ytrain,cv=5,scoring='neg_mean_absolute_error')
mae_scores
array([-1.87465380e+04, -1.48642894e+05, -1.66534572e+12, -1.70210765e+04,
-1.95015931e+04])
# Mean CV MAE — again skewed by the unstable fold.
mae_scores.mean()
-333069185002.16016
# 5-fold CV R^2.
r2_scores = cross_val_score(model,xtrain,ytrain,cv=5,scoring='r2')
r2_scores
array([ 9.04737819e-01, -2.71545925e+02, -1.30458568e+17, 8.88739342e-01,
8.67337101e-01])
# Mean CV R^2.
r2_scores.mean()
-2.6091713694382916e+16
# 5-fold CV RMSE (negated).
rmse_scores = cross_val_score(model,xtrain,ytrain,cv=5,scoring='neg_root_mean_squared_error')
rmse_scores
array([-2.60724592e+04, -1.20063828e+06, -2.54748907e+13, -2.56215791e+04,
-2.81795043e+04])
# Mean CV RMSE.
rmse_scores.mean()
-5094978400959.758
import numpy as np
# Candidate regularisation strengths: 0.1, 0.2, ..., 59.9.
# NOTE(review): the grid search below selects alpha = 59.9 — the upper edge
# of this grid — so the true optimum likely lies beyond it; widen the range.
alphas = np.arange(0.1,60,0.1)
print(alphas)
[ 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2. 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3. 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4. 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5. 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6. 6.1 6.2 6.3 6.4 6.5 6.6 6.7 6.8 6.9 7. 7.1 7.2 7.3 7.4 7.5 7.6 7.7 7.8 7.9 8. 8.1 8.2 8.3 8.4 8.5 8.6 8.7 8.8 8.9 9. 9.1 9.2 9.3 9.4 9.5 9.6 9.7 9.8 9.9 10. 10.1 10.2 10.3 10.4 10.5 10.6 10.7 10.8 10.9 11. 11.1 11.2 11.3 11.4 11.5 11.6 11.7 11.8 11.9 12. 12.1 12.2 12.3 12.4 12.5 12.6 12.7 12.8 12.9 13. 13.1 13.2 13.3 13.4 13.5 13.6 13.7 13.8 13.9 14. 14.1 14.2 14.3 14.4 14.5 14.6 14.7 14.8 14.9 15. 15.1 15.2 15.3 15.4 15.5 15.6 15.7 15.8 15.9 16. 16.1 16.2 16.3 16.4 16.5 16.6 16.7 16.8 16.9 17. 17.1 17.2 17.3 17.4 17.5 17.6 17.7 17.8 17.9 18. 18.1 18.2 18.3 18.4 18.5 18.6 18.7 18.8 18.9 19. 19.1 19.2 19.3 19.4 19.5 19.6 19.7 19.8 19.9 20. 20.1 20.2 20.3 20.4 20.5 20.6 20.7 20.8 20.9 21. 21.1 21.2 21.3 21.4 21.5 21.6 21.7 21.8 21.9 22. 22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8 22.9 23. 23.1 23.2 23.3 23.4 23.5 23.6 23.7 23.8 23.9 24. 24.1 24.2 24.3 24.4 24.5 24.6 24.7 24.8 24.9 25. 25.1 25.2 25.3 25.4 25.5 25.6 25.7 25.8 25.9 26. 26.1 26.2 26.3 26.4 26.5 26.6 26.7 26.8 26.9 27. 27.1 27.2 27.3 27.4 27.5 27.6 27.7 27.8 27.9 28. 28.1 28.2 28.3 28.4 28.5 28.6 28.7 28.8 28.9 29. 29.1 29.2 29.3 29.4 29.5 29.6 29.7 29.8 29.9 30. 30.1 30.2 30.3 30.4 30.5 30.6 30.7 30.8 30.9 31. 31.1 31.2 31.3 31.4 31.5 31.6 31.7 31.8 31.9 32. 32.1 32.2 32.3 32.4 32.5 32.6 32.7 32.8 32.9 33. 33.1 33.2 33.3 33.4 33.5 33.6 33.7 33.8 33.9 34. 34.1 34.2 34.3 34.4 34.5 34.6 34.7 34.8 34.9 35. 35.1 35.2 35.3 35.4 35.5 35.6 35.7 35.8 35.9 36. 36.1 36.2 36.3 36.4 36.5 36.6 36.7 36.8 36.9 37. 37.1 37.2 37.3 37.4 37.5 37.6 37.7 37.8 37.9 38. 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39. 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 40. 40.1 40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 41. 41.1 41.2 41.3 41.4 41.5 41.6 41.7 41.8 41.9 42. 
42.1 42.2 42.3 42.4 42.5 42.6 42.7 42.8 42.9 43. 43.1 43.2 43.3 43.4 43.5 43.6 43.7 43.8 43.9 44. 44.1 44.2 44.3 44.4 44.5 44.6 44.7 44.8 44.9 45. 45.1 45.2 45.3 45.4 45.5 45.6 45.7 45.8 45.9 46. 46.1 46.2 46.3 46.4 46.5 46.6 46.7 46.8 46.9 47. 47.1 47.2 47.3 47.4 47.5 47.6 47.7 47.8 47.9 48. 48.1 48.2 48.3 48.4 48.5 48.6 48.7 48.8 48.9 49. 49.1 49.2 49.3 49.4 49.5 49.6 49.7 49.8 49.9 50. 50.1 50.2 50.3 50.4 50.5 50.6 50.7 50.8 50.9 51. 51.1 51.2 51.3 51.4 51.5 51.6 51.7 51.8 51.9 52. 52.1 52.2 52.3 52.4 52.5 52.6 52.7 52.8 52.9 53. 53.1 53.2 53.3 53.4 53.5 53.6 53.7 53.8 53.9 54. 54.1 54.2 54.3 54.4 54.5 54.6 54.7 54.8 54.9 55. 55.1 55.2 55.3 55.4 55.5 55.6 55.7 55.8 55.9 56. 56.1 56.2 56.3 56.4 56.5 56.6 56.7 56.8 56.9 57. 57.1 57.2 57.3 57.4 57.5 57.6 57.7 57.8 57.9 58. 58.1 58.2 58.3 58.4 58.5 58.6 58.7 58.8 58.9 59. 59.1 59.2 59.3 59.4 59.5 59.6 59.7 59.8 59.9]
# Exhaustive search over the alpha grid, 5-fold CV, scored by (neg) MSE.
params = {'alpha':alphas}
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
model1 = Ridge()
gscv = GridSearchCV(model1,param_grid=params,cv=5,scoring='neg_mean_squared_error')
gscv.fit(xtrain,ytrain)
GridSearchCV(cv=5, estimator=Ridge(),
param_grid={'alpha': array([ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1,
1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3,
3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4,
4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5,
5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6,
6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7....
52.9, 53. , 53.1, 53.2, 53.3, 53.4, 53.5, 53.6, 53.7, 53.8, 53.9,
54. , 54.1, 54.2, 54.3, 54.4, 54.5, 54.6, 54.7, 54.8, 54.9, 55. ,
55.1, 55.2, 55.3, 55.4, 55.5, 55.6, 55.7, 55.8, 55.9, 56. , 56.1,
56.2, 56.3, 56.4, 56.5, 56.6, 56.7, 56.8, 56.9, 57. , 57.1, 57.2,
57.3, 57.4, 57.5, 57.6, 57.7, 57.8, 57.9, 58. , 58.1, 58.2, 58.3,
58.4, 58.5, 58.6, 58.7, 58.8, 58.9, 59. , 59.1, 59.2, 59.3, 59.4,
59.5, 59.6, 59.7, 59.8, 59.9])},
scoring='neg_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=Ridge(),
param_grid={'alpha': array([ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1,
1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3,
3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4,
4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5,
5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6,
6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7....
52.9, 53. , 53.1, 53.2, 53.3, 53.4, 53.5, 53.6, 53.7, 53.8, 53.9,
54. , 54.1, 54.2, 54.3, 54.4, 54.5, 54.6, 54.7, 54.8, 54.9, 55. ,
55.1, 55.2, 55.3, 55.4, 55.5, 55.6, 55.7, 55.8, 55.9, 56. , 56.1,
56.2, 56.3, 56.4, 56.5, 56.6, 56.7, 56.8, 56.9, 57. , 57.1, 57.2,
57.3, 57.4, 57.5, 57.6, 57.7, 57.8, 57.9, 58. , 58.1, 58.2, 58.3,
58.4, 58.5, 58.6, 58.7, 58.8, 58.9, 59. , 59.1, 59.2, 59.3, 59.4,
59.5, 59.6, 59.7, 59.8, 59.9])},
scoring='neg_mean_squared_error')Ridge()
Ridge()
# Winning alpha (59.9 — the top of the grid, see the note at the grid).
gscv.best_params_
{'alpha': 59.900000000000006}
# Best (negated) CV MSE achieved during the search.
gscv.best_score_
-686772056.4636171
# Ridge estimator refit on the full training data with the winning alpha.
best_ridge = gscv.best_estimator_
best_ridge
Ridge(alpha=59.900000000000006)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Ridge(alpha=59.900000000000006)
from sklearn.model_selection import train_test_split
# NOTE(review): identical re-split (same seed, size and inputs) — redundant;
# xtrain/xtest already hold exactly these partitions.
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,test_size=0.2,random_state=60)
# Train R^2 of the tuned ridge model.
best_ridge.score(xtrain,ytrain)
0.9057501960448248
# Test R^2 — modestly better generalisation than plain OLS (0.54 -> 0.60).
best_ridge.score(xtest,ytest)
0.598648242993741
# Full train/test metric report for the tuned ridge model.
evaluate_model(xtrain,ytrain,xtest,ytest,best_ridge)
Training Scores : MSE : 554009305.0643213 RMSE: 23537.40225820006 MAE : 16534.020117614124 R2 : 0.9057501960448248 ============================ Testing Scores : MSE : 3219276470.400496 RMSE: 56738.668211375 MAE : 23053.07247154682 R2 : 0.598648242993741
# Tune an L1-regularised (Lasso) model over the same alpha grid used for Ridge.
from sklearn.linear_model import Lasso
# FIX: Lasso's default max_iter=1000 routinely fails to converge on data of
# this scale, and the ConvergenceWarnings it would emit are globally silenced
# by filterwarnings('ignore') at the top of the file — give the coordinate
# descent solver a realistic iteration budget so the fitted coefficients are
# actually at the optimum.
model3 = Lasso(max_iter=10000)
gscv2 = GridSearchCV(model3, param_grid=params, cv=5, scoring='neg_mean_squared_error')
gscv2.fit(xtrain, ytrain)
GridSearchCV(cv=5, estimator=Lasso(),
param_grid={'alpha': array([ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1,
1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3,
3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4,
4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5,
5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6,
6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7....
52.9, 53. , 53.1, 53.2, 53.3, 53.4, 53.5, 53.6, 53.7, 53.8, 53.9,
54. , 54.1, 54.2, 54.3, 54.4, 54.5, 54.6, 54.7, 54.8, 54.9, 55. ,
55.1, 55.2, 55.3, 55.4, 55.5, 55.6, 55.7, 55.8, 55.9, 56. , 56.1,
56.2, 56.3, 56.4, 56.5, 56.6, 56.7, 56.8, 56.9, 57. , 57.1, 57.2,
57.3, 57.4, 57.5, 57.6, 57.7, 57.8, 57.9, 58. , 58.1, 58.2, 58.3,
58.4, 58.5, 58.6, 58.7, 58.8, 58.9, 59. , 59.1, 59.2, 59.3, 59.4,
59.5, 59.6, 59.7, 59.8, 59.9])},
scoring='neg_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=Lasso(),
param_grid={'alpha': array([ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1,
1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3,
3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4,
4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5,
5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6,
6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7....
52.9, 53. , 53.1, 53.2, 53.3, 53.4, 53.5, 53.6, 53.7, 53.8, 53.9,
54. , 54.1, 54.2, 54.3, 54.4, 54.5, 54.6, 54.7, 54.8, 54.9, 55. ,
55.1, 55.2, 55.3, 55.4, 55.5, 55.6, 55.7, 55.8, 55.9, 56. , 56.1,
56.2, 56.3, 56.4, 56.5, 56.6, 56.7, 56.8, 56.9, 57. , 57.1, 57.2,
57.3, 57.4, 57.5, 57.6, 57.7, 57.8, 57.9, 58. , 58.1, 58.2, 58.3,
58.4, 58.5, 58.6, 58.7, 58.8, 58.9, 59. , 59.1, 59.2, 59.3, 59.4,
59.5, 59.6, 59.7, 59.8, 59.9])},
scoring='neg_mean_squared_error')Lasso()
Lasso()
# Winning Lasso alpha — again the LAST value of the grid (59.9).
# NOTE(review): the optimum is on the grid boundary; widen the alpha range.
gscv2.best_params_
{'alpha': 59.900000000000006}
# Best CV score (neg MSE) — slightly worse than Ridge's -686.8M above.
gscv2.best_score_
-693694117.2987823
# Lasso refit with the winning alpha.
best_lasso = gscv2.best_estimator_
best_lasso
Lasso(alpha=59.900000000000006)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Lasso(alpha=59.900000000000006)
# R^2 on the training split.
best_lasso.score(xtrain,ytrain)
0.9077103273468365
# R^2 on the held-out split — below Ridge on the same split.
best_lasso.score(xtest,ytest)
0.5665219555347658
# Prints MSE/RMSE/MAE/R2 for both splits (helper defined earlier).
evaluate_model(xtrain,ytrain,xtest,ytest,best_lasso)
Training Scores : MSE : 542487466.9819968 RMSE: 23291.360350610626 MAE : 16494.60685908923 R2 : 0.9077103273468365 ============================ Testing Scores : MSE : 3476964145.843731 RMSE: 58965.7879269304 MAE : 23209.057498330316 R2 : 0.5665219555347658
# Load the Kaggle-style test set: 80 columns (no SalePrice target).
# os and pandas are already imported at the top of the file, so the redundant
# re-imports were dropped.
# NOTE(review): relying on chdir + relative paths is fragile — later cells
# write their output into whatever the current directory happens to be;
# prefer passing full paths to read_csv/to_csv.
os.chdir("D:/deta frame all/testing_set")
df2 = pd.read_csv('testing_set.csv')
df2
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1461 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | ... | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
| 1 | 1462 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
| 2 | 1463 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
| 3 | 1464 | 60 | RL | 78.0 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal |
| 4 | 1465 | 120 | RL | 43.0 | 5005 | Pave | NaN | IR1 | HLS | AllPub | ... | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1454 | 2915 | 160 | RM | 21.0 | 1936 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2006 | WD | Normal |
| 1455 | 2916 | 160 | RM | 21.0 | 1894 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2006 | WD | Abnorml |
| 1456 | 2917 | 20 | RL | 160.0 | 20000 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2006 | WD | Abnorml |
| 1457 | 2918 | 85 | RL | 62.0 | 10441 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | 0 | NaN | MnPrv | Shed | 700 | 7 | 2006 | WD | Normal |
| 1458 | 2919 | 60 | RL | 74.0 | 9627 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | NaN | 0 | 11 | 2006 | WD | Normal |
1459 rows × 80 columns
# BUG FIX: the fitted preprocessor must NOT be re-fitted on the test set.
# fit_transform(df2) re-learns imputation values, scaler statistics and
# categorical encodings from the *test* data, silently producing feature
# columns inconsistent with what the models were trained on (test-set
# leakage plus potentially shuffled category codes).
# transform() applies the statistics already fitted on the training data.
xnew = pre1.transform(df2)
xnew
array([[-0.87471081, 0.55558736, 0.36392912, ..., 2. ,
8. , 4. ],
[-0.87471081, 0.60423927, 0.89786065, ..., 0. ,
8. , 4. ],
[ 0.06135085, 0.26367594, 0.80964587, ..., 2. ,
8. , 4. ],
...,
[-0.87471081, 4.44773966, 2.05514965, ..., 2. ,
8. , 0. ],
[ 0.64638939, -0.3201469 , 0.12552719, ..., 2. ,
8. , 4. ],
[ 0.06135085, 0.26367594, -0.03879049, ..., 2. ,
8. , 4. ]])
# Wrap the transformed ndarray back into a labelled DataFrame.
# `cols` is defined earlier in the notebook — presumably
# pre1.get_feature_names_out() from the training cells; TODO confirm it
# matches the 79 transformed columns.
xnew = pd.DataFrame(xnew,columns=cols)
xnew
| num__MSSubClass | num__LotFrontage | num__LotArea | num__OverallQual | num__OverallCond | num__YearBuilt | num__YearRemodAdd | num__MasVnrArea | num__BsmtFinSF1 | num__BsmtFinSF2 | ... | cat__GarageType | cat__GarageFinish | cat__GarageQual | cat__GarageCond | cat__PavedDrive | cat__PoolQC | cat__Fence | cat__MiscFeature | cat__SaleType | cat__SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.874711 | 0.555587 | 0.363929 | -0.751101 | 0.400766 | -0.340945 | -1.072885 | -0.570108 | 0.063295 | 0.517348 | ... | 1.0 | 2.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| 1 | -0.874711 | 0.604239 | 0.897861 | -0.054877 | 0.400766 | -0.439695 | -1.214908 | 0.041273 | 1.063392 | -0.297903 | ... | 1.0 | 2.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 0.0 | 8.0 | 4.0 |
| 2 | 0.061351 | 0.263676 | 0.809646 | -0.751101 | -0.497418 | 0.844059 | 0.678742 | -0.570108 | 0.773254 | -0.297903 | ... | 1.0 | 0.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| 3 | 0.061351 | 0.458284 | 0.032064 | -0.054877 | 0.400766 | 0.876976 | 0.678742 | -0.456889 | 0.357829 | -0.297903 | ... | 1.0 | 0.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| 4 | 1.465443 | -1.244533 | -0.971808 | 1.337571 | -0.497418 | 0.679475 | 0.394694 | -0.570108 | -0.387298 | -0.297903 | ... | 1.0 | 1.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1454 | 2.401505 | -2.314875 | -1.591330 | -1.447325 | 1.298950 | -0.044694 | -0.646813 | -0.570108 | -0.965376 | -0.297903 | ... | 1.0 | 2.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| 1455 | 2.401505 | -2.314875 | -1.599808 | -1.447325 | -0.497418 | -0.044694 | -0.646813 | -0.570108 | -0.411477 | -0.297903 | ... | 4.0 | 2.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 0.0 |
| 1456 | -0.874711 | 4.447740 | 2.055150 | -0.751101 | 1.298950 | -0.373861 | 0.584059 | -0.570108 | 1.724994 | -0.297903 | ... | 5.0 | 2.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 0.0 |
| 1457 | 0.646389 | -0.320147 | 0.125527 | -0.751101 | -0.497418 | 0.679475 | 0.394694 | -0.570108 | -0.224645 | -0.297903 | ... | 1.0 | 2.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 4.0 |
| 1458 | 0.061351 | 0.263676 | -0.038790 | 0.641347 | -0.497418 | 0.712392 | 0.489377 | -0.037980 | 0.700719 | -0.297903 | ... | 1.0 | 0.0 | 3.0 | 4.0 | 2.0 | 0.0 | 2.0 | 2.0 | 8.0 | 4.0 |
1459 rows × 79 columns
# Predict sale prices for the test set with the tuned Ridge model.
# The output echoed below is 2-D with shape (n, 1) — presumably because the
# target Y was fitted as a single-column DataFrame rather than a Series.
preds = best_ridge.predict(xnew)
preds
array([[114581.18091902],
[164615.31439117],
[178213.89150915],
...,
[159915.01113896],
[120458.74718234],
[230229.83888659]])
# Start the submission frame from the test-set Id column.
# .copy() makes an explicitly independent frame, so the prediction column
# added later cannot trip pandas' SettingWithCopy machinery — a warning the
# global filterwarnings('ignore') at the top of the file would hide anyway.
df_final = df2[['Id']].copy()
df_final
| Id | |
|---|---|
| 0 | 1461 |
| 1 | 1462 |
| 2 | 1463 |
| 3 | 1464 |
| 4 | 1465 |
| ... | ... |
| 1454 | 2915 |
| 1455 | 2916 |
| 1456 | 2917 |
| 1457 | 2918 |
| 1458 | 2919 |
1459 rows × 1 columns
# Attach the predictions. ravel() flattens the (n, 1) prediction array to 1-D
# so the column assignment does not depend on pandas implicitly squeezing a
# 2-D array — behaviour that differs across pandas versions.
df_final['SalePrice_pred'] = preds.ravel()
df_final
| Id | SalePrice_pred | |
|---|---|---|
| 0 | 1461 | 114581.180919 |
| 1 | 1462 | 164615.314391 |
| 2 | 1463 | 178213.891509 |
| 3 | 1464 | 189535.749247 |
| 4 | 1465 | 176445.253995 |
| ... | ... | ... |
| 1454 | 2915 | 67997.892217 |
| 1455 | 2916 | 50017.116016 |
| 1456 | 2917 | 159915.011139 |
| 1457 | 2918 | 120458.747182 |
| 1458 | 2919 | 230229.838887 |
1459 rows × 2 columns
# Distribution of predicted sale prices with a KDE overlay.
# (seaborn is imported as `sns` earlier in the notebook, above this view.)
sns.histplot(data=df_final,x='SalePrice_pred',kde=True)
<Axes: xlabel='SalePrice_pred', ylabel='Count'>
df_final.to_csv('Predictions_House_Price.csv',index=False)